library("tidyr")
library('ggplot2')
library('dplyr')
library("glue")
library('ggVennDiagram')

# Analysis root; the relative paths read later in this script resolve against it.
wkdir = "~/Desktop/GitHub/Obesity/NewExtractions/H9N2/timo_0.01"
# NOTE(review): setwd() ties the script to this machine's directory layout.
setwd(wkdir)
# Output directory handed to ggsave(path = ...) for the saved figures.
savedir = "~/Desktop/GitHub/Obesity/NewExtractions/H9N2/timo_0.01/Output_Figures"

# Project helpers; presumably the source of CoverageAcross(), ArrangeVarWRep(),
# PlotTheme1 and PlotTheme2, none of which are defined in this file - TODO confirm.
source("~/Desktop/GitHub/Obesity/NewExtractions/H9N2/FD_functions.R")
# Diet groups and the colour assigned to each one; naming the vector lets the
# manual scales match colours to diet values by name.
diet <- c("Obese", "Lean", "Control")
dietColors <- setNames(c("#FF9933", "#66CCFF", "#606060"), diet)
# Shared manual scales (legend title "grp") for the fill and colour aesthetics.
DietcolScale_fill <- scale_fill_manual(name = "grp", values = dietColors)
DietcolScale <- scale_colour_manual(name = "grp", values = dietColors)

Specifying thresholds and plotting variables

# Coverage cutoff handed to CoverageAcross().
cov_cut <- 200
# Minimum minor-allele frequency each replicate must reach.
freq_cut <- 0.01
pvalcut <- 0.05

# Valid nucleotides and the segment order used for factor levels.
ntlist <- c("A", "C", "G", "T")
SEGMENTS <- c(
  "H9N2_PB2", "H9N2_PB1", "H9N2_PA", "H9N2_HA",
  "H9N2_NP", "H9N2_NA", "H9N2_MP", "H9N2_NS"
)

#Loading metadata This includes titer and Ct values when applicable. ND indicates qPCR was run with a negative result; 0 indicates plaque assay or HAI was run with a negative result. NA for any value indicates that data was missing. Sacrificed indicates there was no data at that time point because the ferret had already been sacrificed for pathology.

# Sample metadata table; empty strings are read as NA.
# (Removed the duplicated chained assignment `metafile = metafile = ...`.)
metafile = "~/Desktop/GitHub/Obesity/NewExtractions/H9N2/H9_Metadata.csv"

# TRUE instead of T: T is an ordinary variable that can be reassigned.
meta = read.csv(file = metafile, header = TRUE, sep = ",", na.strings = c(''))
# Keep only the samples flagged as resequenced.
meta = filter(meta, resequenced == "yes")

# Coerce the measurement columns to numeric; entries that are not numbers become NA,
# which is what the "NAs introduced by coercion" warnings below report.
meta$Ct_Mgene = as.numeric(meta$Ct_Mgene)
Warning: NAs introduced by coercion
meta$titer = as.numeric(meta$titer)
Warning: NAs introduced by coercion
meta$log10_titer = as.numeric(meta$log10_titer)
Warning: NAs introduced by coercion
# Fix the level order of the infection routes.
meta$inf_route = factor(meta$inf_route, levels = c("Index","Contact","Aerosol","Control"))

Loading in coverage file & segment size information

# Coverage table, read relative to the working directory.
cov <- read.csv("./avg_coverage/H9N2.coverage.csv", header = TRUE, sep = ",")

# Segment lengths; GenomeSize is the SegmentSize of the H9N2_GENOME row.
seg_sizes <- "../SegmentSize.csv"
sizes <- read.csv(file = seg_sizes, header = TRUE, sep = ",", na.strings = c(""))
GenomeSize <- filter(sizes, segment == "H9N2_GENOME")$SegmentSize

# Order segments as listed in SEGMENTS.
cov$segment <- factor(cov$segment, levels = SEGMENTS)

Checking if data passes thresholds

# Project helper (defined outside this file); called with the coverage cutoff and a
# literal 40, which its printed output reports as a 200x depth / 40% covered cutoff.
cov_check = CoverageAcross(cov,cov_cut,40,sizes, wkdir)
Coverage cutoff is: 200x
Percentage covered cutoff is: 40%

Merging coverage check info with the rest of the metadata

# Join metadata onto the coverage check; all.y = TRUE keeps every cov_check row
# even when no metadata row matches.
meta = merge(meta, cov_check, by.x = c("sample"), by.y = c("name"), all.y = TRUE)

nrow(meta)
[1] 1536
# Row count per value of the `quality` column.
count(meta,quality)

Loading in variant files

# Variant table; the file name carries "0.01.200", presumably the frequency and
# coverage settings used upstream - TODO confirm.
varfile <- "./varfiles/H9N2.VariantsOnly.0.01.200.csv"

# Read the calls (empty strings become NA) and mirror `sample` into `name`.
vars <- read.csv(file = varfile, header = TRUE, sep = ",", na.strings = c(""))
vars$name <- vars$sample

Rearranging variant dataframe

# Rearrange with the project helper; the variant files already carry replicate
# data from the CompareReps.v2.py script.
vdf <- ArrangeVarWRep(vars)
# Drop exact duplicate rows and unused factor levels.
vdf <- droplevels(vdf[!duplicated(vdf), ])
nrow(vdf)
[1] 1781

Filtering variant df with frequency cutoffs

# Keep a variant only when BOTH replicates reach freq_cut and both alleles are
# plain nucleotides; each replicate is tested on its own rather than on the average.
# Based on the MAF study, replicates plus the 0.01 cutoff was the best combination.
vdf <- vdf %>%
  filter(
    minorfreq1 >= freq_cut,
    minorfreq2 >= freq_cut,
    minor %in% ntlist,
    major %in% ntlist
  ) %>%
  droplevels()

vdf <- droplevels(vdf[!duplicated(vdf), ])
nrow(vdf)
[1] 1702
# does not eliminate any variants here

Filtering variant df by timo binocheck

#vdf$binocheck = factor(vdf$binocheck, levels = c("False","R1","R2","True"))
#vdf = filter(vdf, binocheck != "False") %>% unique()
#nrow(vdf)

# binocheck is highly dependent on the allele frequency threshold used and also relatively conservative
# as a result, ignore this in favor of found in both replicates across ferrets and cohorts - this is more indicative of a real variant than binocheck

Adding metadata

# Attach per-sample, per-segment metadata to the variant table.
vdf <- merge(vdf, meta, by = c("sample", "segment"))
vdf <- droplevels(vdf[!duplicated(vdf), ])

vdf$segment <- factor(vdf$segment, levels = SEGMENTS)

vdf <- vdf %>%
  # ignoring aerosol for now
  filter(inf_route %in% c("Index", "Contact", "Control")) %>%
  # 2232 is both a contact and then an index to another contact; drop the index
  # instance so it is only considered as a contact and is not double counted
  filter(ferretID != 2232 | inf_route != "Index") %>%
  # keep only samples that passed the coverage check
  filter(quality == "good")
vdf <- droplevels(vdf[!duplicated(vdf), ])

# Sample names that survived the filters above.
good_names <- levels(factor(vdf$sample))
# NOTE(review): absolute user-specific path, unlike the "~"-based paths used elsewhere.
transmission_info <- "/Users/marissaknoll/Desktop/GitHub/Obesity/NewExtractions/H9N2/TransmissionPairs.csv"
pairs <- read.csv(transmission_info, header = TRUE)
# Consensus changes: rows whose major allele is a plain nucleotide that differs
# from the stock nucleotide.
con_change = filter(vdf, stocknt != major, major %in% ntlist)
con_change = con_change[!duplicated(con_change), ]
# Position key in the same "<segment>_<ntpos>" format that vdf$ntvarpos uses.
con_change$ntvarpos = paste0(con_change$segment,"_",con_change$ntpos)
# Read the key by its full name. `con_change$ntvar` only returned these values
# through `$` partial matching (or picked up a differently formatted `ntvar`
# column), while the downstream filter compares against `ntvarpos`.
consensus = unique(con_change$ntvarpos)
length(consensus)
[1] 11
# "<segment>_<ntpos>" key for every variant row.
vdf$ntvarpos <- paste0(vdf$segment, "_", vdf$ntpos)

# Variants at positions that are not consensus changes, and how many rows that removes.
minorvdf <- unique(filter(vdf, !ntvarpos %in% consensus))
nrow(vdf) - nrow(minorvdf)
[1] 212

SNV location plots

# Every retained SNV by nucleotide position (x) and ferret (y), one panel per
# infection route x segment, coloured by diet with point shape by cohort.
SNVLocation <- ggplot(data = vdf, mapping = aes(x = ntpos, y = ferretID)) +
  geom_point(mapping = aes(color = diet, shape = cohort)) +
  facet_grid(inf_route ~ segment) +
  PlotTheme1 +
  DietcolScale
print(SNVLocation)
ggsave(filename = "SNVLocation.pdf", plot = SNVLocation, path = savedir)
Saving 7.29 x 4.51 in image

# ferret 1787 doesn't have any variants??
# Comparing to SNVs found in the stock
# changed to include consensus variants 9/14/23

# Stock-sample variants for each cohort and the positions they occupy.
F17_stock <- vdf %>% filter(DPI == "Stock" & cohort == "F17")
F17_stock_ntvarpos <- F17_stock$ntvarpos %>% unique()
W17_stock <- vdf %>% filter(DPI == "Stock" & cohort == "W17")
W17_stock_ntvarpos <- W17_stock$ntvarpos %>% unique()
Sm18_stock <- vdf %>% filter(DPI == "Stock" & cohort == "Sm18")
Sm18_stock_ntvarpos <- Sm18_stock$ntvarpos %>% unique()
Sp19_stock <- vdf %>% filter(DPI == "Stock" & cohort == "Sp19")
Sp19_stock_ntvarpos <- Sp19_stock$ntvarpos %>% unique()
Sp20_stock <- vdf %>% filter(DPI == "Stock" & cohort == "Sp20")
Sp20_stock_ntvarpos <- Sp20_stock$ntvarpos %>% unique()

# Non-control variants for each cohort and the positions they occupy.
F17_ferret <- vdf %>% filter(cohort == "F17" & inf_route != "Control")
F17_ferret_ntvarpos <- F17_ferret$ntvarpos %>% unique()
W17_ferret <- vdf %>% filter(cohort == "W17" & inf_route != "Control")
W17_ferret_ntvarpos <- W17_ferret$ntvarpos %>% unique()
Sm18_ferret <- vdf %>% filter(cohort == "Sm18" & inf_route != "Control")
Sm18_ferret_ntvarpos <- Sm18_ferret$ntvarpos %>% unique()
Sp19_ferret <- vdf %>% filter(cohort == "Sp19" & inf_route != "Control")
Sp19_ferret_ntvarpos <- Sp19_ferret$ntvarpos %>% unique()
Sp20_ferret <- vdf %>% filter(cohort == "Sp20" & inf_route != "Control")
Sp20_ferret_ntvarpos <- Sp20_ferret$ntvarpos %>% unique()

# Export the pooled stock variants with a reduced set of columns.
all_stock_var <- rbind(F17_stock, W17_stock, Sm18_stock, Sp19_stock, Sp20_stock) %>%
  select(sample, ntvarpos, aapos, major, majoraa, minor, minoraa, minorfreq, nonsyn)
write.csv(all_stock_var, file = "all_stock_var.csv", row.names = FALSE)
# Split each cohort's ferret variants by whether the position is also variable in
# that cohort's stock: *_shared = position present in stock, *_denovo = absent.
# The former extra `ntvarpos %in% <cohort>_ferret_ntvarpos` filter was a no-op
# (every row of a cohort table is in that table's own position list) and is removed.
F17_shared = F17_ferret %>% filter(ntvarpos %in% F17_stock_ntvarpos) %>% unique()
F17_denovo = F17_ferret %>% filter(!(ntvarpos %in% F17_stock_ntvarpos)) %>% unique()

W17_shared = W17_ferret %>% filter(ntvarpos %in% W17_stock_ntvarpos) %>% unique()
W17_denovo = W17_ferret %>% filter(!(ntvarpos %in% W17_stock_ntvarpos)) %>% unique()

Sm18_shared = Sm18_ferret %>% filter(ntvarpos %in% Sm18_stock_ntvarpos) %>% unique()
Sm18_denovo = Sm18_ferret %>% filter(!(ntvarpos %in% Sm18_stock_ntvarpos)) %>% unique()

Sp19_shared = Sp19_ferret %>% filter(ntvarpos %in% Sp19_stock_ntvarpos) %>% unique()
Sp19_denovo = Sp19_ferret %>% filter(!(ntvarpos %in% Sp19_stock_ntvarpos)) %>% unique()

Sp20_shared = Sp20_ferret %>% filter(ntvarpos %in% Sp20_stock_ntvarpos) %>% unique()
Sp20_denovo = Sp20_ferret %>% filter(!(ntvarpos %in% Sp20_stock_ntvarpos)) %>% unique()

# Pool the cohorts. `stock_shared` and `ferunique` are used by the plots and
# analyses that follow but are not assigned anywhere in the visible script.
# NOTE(review): pooled definitions inferred from how the names are used - confirm
# against the original analysis.
stock_shared = rbind(F17_shared, W17_shared, Sm18_shared, Sp19_shared, Sp20_shared) %>% unique()
ferunique = rbind(F17_denovo, W17_denovo, Sm18_denovo, Sp19_denovo, Sp20_denovo) %>% unique()

SNV Location compared to stock

# SNVs at positions that are also variable in the stock; empty facets are kept.
StockSharedPlot <- ggplot(data = stock_shared, mapping = aes(x = ntpos, y = ferretID)) +
  geom_point(mapping = aes(color = diet, shape = cohort), size = 2) +
  facet_grid(inf_route ~ segment, drop = FALSE) +
  PlotTheme1 +
  DietcolScale +
  ggtitle("SNVs found in stock")
print(StockSharedPlot)
ggsave(
  filename = "StockSharedPlot.pdf", plot = StockSharedPlot,
  path = savedir, width = 15, height = 30
)


# SNVs at positions that are not variable in the stock.
FerUniquePlot <- ggplot(data = ferunique, mapping = aes(x = ntpos, y = ferretID)) +
  geom_point(mapping = aes(color = diet)) +
  facet_grid(inf_route ~ segment) +
  PlotTheme1 +
  DietcolScale +
  ggtitle("SNVs not found in stock")
print(FerUniquePlot)
ggsave(filename = "FerUniquePlot.pdf", plot = FerUniquePlot, path = savedir)
Saving 7.29 x 4.51 in image

Shared de novo SNVs

# Variant label: "<segment>_<major><ntpos><minor>".
ferunique$ntvar <- paste0(
  ferunique$segment, "_", ferunique$major, ferunique$ntpos, ferunique$minor
)

# Count distinct ferrets per variant. The first tally collapses to one row per
# (variant, ferret) so a variant seen on several DPI in the same ferret counts
# once; the second tally counts those rows. Keep variants seen in more than one ferret.
shared_vars <- ferunique %>%
  group_by(ntvar, ferretID) %>%
  tally() %>%
  group_by(ntvar) %>%
  tally() %>%
  filter(n > 1) %>%
  unique()

# 100 variants that are shared between any ferrets
shared_denovominors <- unique(shared_vars$ntvar)
df_allshared <- ferunique %>% filter(ntvar %in% shared_denovominors)

Obese- and lean-specific SNVs

# Labels of the shared de novo variants seen in each diet group.
o_var <- unique(filter(df_allshared, diet == "Obese")$ntvar)

l_var <- unique(filter(df_allshared, diet == "Lean")$ntvar)

diet_var <- list(Obese = o_var, Lean = l_var)

# Venn diagram of obese and lean de novo shared SNVs
DietUniqueSNVS <- ggVennDiagram(diet_var)
print(DietUniqueSNVS)
ggsave(filename = "DietUniqueSNVS.pdf", plot = DietUniqueSNVS, path = savedir)
Saving 7.29 x 4.51 in image

# 100 total - 67 shared between obese and lean, 20 obese specific, 13 lean specific
# Rows for variants seen in lean ferrets and in no obese ferret.
lean <- df_allshared %>%
  filter(ntvar %in% setdiff(l_var, o_var)) %>%
  unique()
unique(lean$ntvar)
 [1] "H9N2_HA_G651A"   "H9N2_HA_C802T"   "H9N2_MP_G339A"   "H9N2_PB1_T1604C" "H9N2_PB1_G738A"  "H9N2_HA_C1118T"  "H9N2_PA_G1986A" 
 [8] "H9N2_HA_G808A"   "H9N2_NS_G294A"   "H9N2_NS_G374A"   "H9N2_NS_G660A"   "H9N2_NS_T375A"   "H9N2_HA_A1531T" 
# Combined ferret + variant key.
lean$ferretID_var <- paste0(lean$ferretID, "_", lean$ntvar)

# Number of distinct ferrets carrying each lean-only variant (one row per
# variant/ferret first, so repeated DPI in one ferret are not double counted).
repeats_lean <- lean %>%
  group_by(ntvar, ferretID) %>%
  tally() %>%
  group_by(ntvar) %>%
  tally() %>%
  unique()

lean <- unique(merge(lean, repeats_lean, by = "ntvar"))

# Rows for variants seen in obese ferrets and in no lean ferret.
obese <- df_allshared %>%
  filter(ntvar %in% setdiff(o_var, l_var)) %>%
  unique()
unique(obese$ntvar)
 [1] "H9N2_HA_A658G"   "H9N2_HA_C383A"   "H9N2_MP_G459A"   "H9N2_NA_G452A"   "H9N2_NA_G72A"    "H9N2_NS_A719G"   "H9N2_PB2_A1351T"
 [8] "H9N2_PB2_A480G"  "H9N2_PB2_A482G"  "H9N2_MP_T444C"   "H9N2_NP_T911C"   "H9N2_PB1_T905C"  "H9N2_PB1_T906C"  "H9N2_PB2_C1928G"
[15] "H9N2_HA_C375T"   "H9N2_PB1_G591A"  "H9N2_PA_C1873T"  "H9N2_NP_C249T"   "H9N2_HA_A747G"   "H9N2_PA_C1782A" 
# Combined ferret + variant key.
obese$ferretID_var <- paste0(obese$ferretID, "_", obese$ntvar)

# Number of distinct ferrets carrying each obese-only variant (one row per
# variant/ferret first, so repeated DPI in one ferret are not double counted).
repeats_obese <- obese %>%
  group_by(ntvar, ferretID) %>%
  tally() %>%
  group_by(ntvar) %>%
  tally() %>%
  unique()

obese <- unique(merge(obese, repeats_obese, by = "ntvar"))

# Stack both diet-specific sets; the ferret count moves from `n` to `ferret_num`.
dietunique <- rbind(lean, obese) %>%
  unique() %>%
  mutate(ferret_num = n) %>%
  select(!n)
# had to look up these positions manually
# Overwrite the annotation of three MP variants with hand-curated values.
# mutate() is used instead of `df$col = value` because scalar `$<-` assignment
# errors when the filter returns zero rows; mutate() simply yields an empty frame.
MP_G459A = filter(dietunique, ntvar == "H9N2_MP_G459A") %>%
  unique() %>%
  mutate(nonsyn = "syn", aavar = "Q153Q")
MP_T444C = filter(dietunique, ntvar == "H9N2_MP_T444C") %>%
  unique() %>%
  mutate(nonsyn = "syn", aavar = "C148C")
MP_G339A = filter(dietunique, ntvar == "H9N2_MP_G339A") %>%
  unique() %>%
  mutate(nonsyn = "syn", aavar = "K113K")

# Replace the original rows for those variants with the corrected ones.
MPs = c("H9N2_MP_G459A","H9N2_MP_T444C","H9N2_MP_G339A")
rest = filter(dietunique, !(ntvar %in% MPs)) %>% unique()
dietunique = rbind(rest, MP_G459A,MP_T444C,MP_G339A)

AF and emergence of obese-specific variants

# What is the AF distribution of obese-specific variants
# (nonsynonymous, found in exactly two ferrets)
ggplot(
  data = filter(dietunique, diet == "Obese", nonsyn == "nonsyn", ferret_num == 2),
  mapping = aes(x = minorfreq)
) +
  geom_histogram(binwidth = 0.01) +
  PlotTheme1


ggplot(
  data = filter(dietunique, diet == "Obese", nonsyn == "nonsyn", ferret_num == 2),
  mapping = aes(x = inf_route, y = minorfreq)
) +
  geom_boxplot() +
  #facet_grid(~inf_route) +
  PlotTheme1


# Obese adaptation -> higher AF than non shared?
# Index vs contact frequencies for the same subset.
o_in <- filter(
  dietunique,
  diet == "Obese", nonsyn == "nonsyn", ferret_num == 2, inf_route == "Index"
)
o_co <- filter(
  dietunique,
  diet == "Obese", nonsyn == "nonsyn", ferret_num == 2, inf_route == "Contact"
)
t.test(o_in$minorfreq, o_co$minorfreq)

    Welch Two Sample t-test

data:  o_in$minorfreq and o_co$minorfreq
t = -1.1364, df = 15.17, p-value = 0.2734
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.07033827  0.02138534
sample estimates:
 mean of x  mean of y 
0.03827363 0.06275010 
# Diet adaptation (lean and obese) -> higher AF than non shared?
ind <- filter(dietunique, nonsyn == "nonsyn", ferret_num == 2, inf_route == "Index")
#t.test(ind$minorfreq,non_share$minorfreq)
#
# Do they persist
# Lean-only variants taken from ferunique (not df_allshared).
lean2 <- ferunique %>%
  filter(ntvar %in% setdiff(l_var, o_var)) %>%
  unique()
lean2$ferretID_var <- paste0(lean2$ferretID, "_", lean2$ntvar)

# day_num = number of rows for each (variant, ferret) pair.
repeats_lean2 <- lean2 %>%
  group_by(ntvar, ferretID) %>%
  mutate(count = 1, day_num = sum(count)) %>%
  ungroup()

# fer_num = number of distinct ferrets carrying each variant.
lean_fers <- repeats_lean2 %>%
  select(ntvar, ferretID) %>%
  unique() %>%
  group_by(ntvar) %>%
  tally() %>%
  rename(fer_num = n)
lean_wrep <- unique(merge(repeats_lean2, lean_fers, by = "ntvar"))

####

# Obese-only variants taken from ferunique (not df_allshared).
obese2 <- ferunique %>%
  filter(ntvar %in% setdiff(o_var, l_var)) %>%
  unique()
obese2$ferretID_var <- paste0(obese2$ferretID, "_", obese2$ntvar)

# day_num = number of rows for each (variant, ferret) pair.
repeats_obese2 <- obese2 %>%
  group_by(ntvar, ferretID) %>%
  mutate(count = 1, day_num = sum(count)) %>%
  ungroup()
# fer_num = number of distinct ferrets carrying each variant.
ob_fers <- repeats_obese2 %>%
  select(ntvar, ferretID) %>%
  unique() %>%
  group_by(ntvar) %>%
  tally() %>%
  rename(fer_num = n)
obese_wrep <- unique(merge(repeats_obese2, ob_fers, by = "ntvar"))

dietunique_repeats <- unique(rbind(obese_wrep, lean_wrep))
# Full outer merge with the good-quality samples, so samples without any of these
# variants also appear as rows.
meta_good <- meta %>% filter(quality == "good") %>% select(sample, quality)
dietunique_repeats_zeros <- unique(merge(dietunique_repeats, meta_good, all = TRUE))
# Frequency over DPI for nonsynonymous diet-specific variants found in exactly
# two ferrets, one panel per ferret and one line per variant.
persistence <- ggplot(
  data = filter(dietunique_repeats, nonsyn == "nonsyn", fer_num == 2),
  mapping = aes(x = DPI, y = minorfreq)
) +
  geom_point(mapping = aes(color = ntvar)) +
  geom_line(mapping = aes(group = ntvar)) +
  facet_grid(~ ferretID) +
  PlotTheme1
print(persistence)
ggsave(filename = "persistence.pdf", plot = persistence, path = savedir, width = 25, height = 5)

# Emergence
# For obese-specific nonsynonymous variants found in two ferrets: the share of
# each infection route's rows that falls on each DPI.
timing <- dietunique %>%
  filter(diet == "Obese", nonsyn == "nonsyn", ferret_num == 2) %>%
  mutate(count = 1) %>%
  group_by(inf_route, DPI) %>%
  mutate(perday = sum(count)) %>%
  group_by(inf_route) %>%
  mutate(pergroup = sum(count), day_ratio = perday / pergroup) %>%
  select(DPI, inf_route, perday, pergroup, day_ratio) %>%
  unique()

ggplot(data = timing, mapping = aes(x = DPI, y = day_ratio)) +
  geom_col() +
  facet_grid(~ inf_route) +
  PlotTheme1


# Same per-day share as above, computed within each diet x infection-route group.
timing_bydiet <- dietunique %>%
  filter(nonsyn == "nonsyn", ferret_num == 2) %>%
  mutate(count = 1) %>%
  group_by(diet, inf_route, DPI) %>%
  mutate(perday = sum(count)) %>%
  group_by(diet, inf_route) %>%
  mutate(pergroup = sum(count), day_ratio = perday / pergroup) %>%
  select(DPI, diet, inf_route, perday, pergroup, day_ratio) %>%
  unique()

ggplot(data = timing_bydiet, mapping = aes(x = DPI, y = day_ratio)) +
  geom_col() +
  facet_grid(diet ~ inf_route) +
  PlotTheme1

Determining if diet-unique shared variants are transmitted

# Attach transmission-pair information to each diet-specific variant row.
dietunique_pairs = merge(dietunique, pairs, by = c("ferretID"))

# Variants carried by more than one ferret.
shared = filter(dietunique_pairs, ferret_num > 1)
# NOTE(review): `t` shadows base::t(); the name is kept in case later code reads it.
t = unique(shared$ntvar)

# For every variant, count how many distinct ferrets within each transmission
# pair carry it. The per-variant results are collected in a list and bound once,
# instead of growing a data frame with rbind() on every iteration.
transmitted = bind_rows(lapply(t, function(var_id) {
  print(var_id)
  var_rows = filter(shared, ntvar == var_id)
  per_pair = var_rows %>%
    # one row per ferret first, to avoid double counting variants present at
    # multiple DPI in the same ferret
    group_by(ntvar, ferretID, pair_numbers) %>% tally() %>%
    group_by(pair_numbers) %>% tally()
  # here a 2 means that the two ferrets are in the same transmission pair and
  # a 1 indicates different transmission pairs
  annotated = merge(var_rows, per_pair, by = c("pair_numbers"))
  # add this information back into the dataframe
  annotated$transmission = annotated$n
  annotated
}))
[1] "H9N2_NS_A719G"
[1] "H9N2_PB2_A480G"
[1] "H9N2_NA_G72A"
[1] "H9N2_NA_G452A"
[1] "H9N2_HA_C383A"
[1] "H9N2_MP_G459A"
[1] "H9N2_PB2_A1351T"
[1] "H9N2_HA_A658G"
[1] "H9N2_PB1_T905C"
[1] "H9N2_PB2_A482G"
[1] "H9N2_MP_T444C"
[1] "H9N2_NP_T911C"
[1] "H9N2_PB1_T906C"
[1] "H9N2_HA_C802T"
[1] "H9N2_HA_G808A"
[1] "H9N2_PB2_C1928G"
[1] "H9N2_PA_G1986A"
[1] "H9N2_NS_G294A"
[1] "H9N2_HA_G651A"
[1] "H9N2_MP_G339A"
[1] "H9N2_PB1_G738A"
[1] "H9N2_PB1_T1604C"
[1] "H9N2_HA_C1118T"
[1] "H9N2_HA_C375T"
[1] "H9N2_PB1_G591A"
[1] "H9N2_NS_G660A"
[1] "H9N2_NS_G374A"
[1] "H9N2_NS_T375A"
[1] "H9N2_PA_C1873T"
[1] "H9N2_NP_C249T"
[1] "H9N2_HA_A747G"
[1] "H9N2_PA_C1782A"
[1] "H9N2_HA_A1531T"
# formatting stuff: keep the pair-level count as a character label, drop the raw count
transmitted <- transmitted %>%
  mutate(transmission = as.character(n)) %>%
  select(!n)

Removing possible transmissions since they would not be considered recurrent

# Variants whose carriers sit in different transmission pairs (transmission == "1").
recurrent = filter(transmitted, transmission == "1")
# Number of distinct recurrent variants per diet.
select(recurrent, ntvar, diet) %>% unique() %>% count(diet)
# Nonsynonymous / synonymous row counts per diet.
recurrent_dNdS = group_by(recurrent, diet, nonsyn) %>% tally() %>% 
  pivot_wider(names_from = nonsyn, values_from = n) %>%
  mutate(dNdS = nonsyn/syn)

# Position of each recurrent variant per segment, one panel per diet.
# Constants (size, vjust, hjust) are set outside aes(): inside aes() they are
# treated as data mappings, which adds a meaningless size legend and does not
# draw the points at size 2. rev(SEGMENTS) reproduces the NS -> PB2 order that
# was previously written out by hand.
DietUnique_Recurrent = ggplot(recurrent, 
                              aes(x = ntpos, 
                                  y = factor(segment, levels = rev(SEGMENTS)))) +
  geom_point(aes(color = nonsyn), size = 2) + 
  geom_text(data = filter(recurrent, ferret_num == 2, nonsyn == "nonsyn"),
            aes(label = aavar), vjust = 2, hjust = 0.5) +
  xlab("Nucleotide position") +
  ylab("Segment") +
  facet_grid(~diet) +
  PlotTheme1
print(DietUnique_Recurrent)
ggsave(filename = "DietUnique_Recurrent.pdf", plot = DietUnique_Recurrent,
       width = 10, height = 5, path = savedir)

Looking at transmitted minor vars

# Variants whose two carriers sit in the same transmission pair (transmission == "2").
trans_var = filter(transmitted, transmission == "2")
# Number of distinct such variants per diet.
select(trans_var, ntvar, diet) %>% unique() %>% count(diet)

# Position of each candidate transmitted variant, panelled by pair and route.
# Constants (size, vjust, hjust) are set outside aes(): inside aes() they are
# treated as data mappings, which adds a meaningless size legend and does not
# draw the points at size 2. rev(SEGMENTS) reproduces the NS -> PB2 order that
# was previously written out by hand.
transmitted_denovo_minors = ggplot(trans_var,
                                   aes(x = ntpos,
                                       y = factor(segment, levels = rev(SEGMENTS)))) +
  geom_point(aes(color = nonsyn), size = 2) + 
  geom_text(data = filter(trans_var, ferret_num == 2),
            aes(label = aavar), vjust = 2, hjust = 0.5) +
  xlab("Nucleotide position") +
  ylab("Segment") +
  facet_grid(pair_numbers+pair_diets~inf_route) +
  PlotTheme2
print(transmitted_denovo_minors)
ggsave(filename = "DietUnique_Transmitted.pdf", plot = transmitted_denovo_minors,
       width = 7, height = 5, path = savedir)

SNVs shared between diet groups

# Rows for variants seen in BOTH obese and lean ferrets.
shared <- df_allshared %>%
  filter(ntvar %in% intersect(o_var, l_var)) %>%
  unique()
shared$ferretID_var <- paste0(shared$ferretID, "_", shared$ntvar)

# n = number of distinct ferrets per variant; collapsing to one row per
# (variant, ferret) first makes sure a variant found in one ferret on multiple
# days is not counted repeatedly.
repeats_shared <- shared %>%
  group_by(ntvar, ferretID) %>%
  tally() %>%
  group_by(ntvar) %>%
  tally()

shared <- unique(merge(shared, repeats_shared, by = "ntvar"))

# Point size encodes n; nonsynonymous variants with n > 4 are labelled.
SharedPlot <- ggplot(
  data = shared,
  mapping = aes(
    x = ntpos,
    y = factor(segment, levels = c('H9N2_NS','H9N2_MP','H9N2_NA','H9N2_NP','H9N2_HA','H9N2_PA','H9N2_PB1','H9N2_PB2'))
  )
) +
  geom_point(mapping = aes(size = n, color = nonsyn)) +
  geom_text(
    data = filter(shared, n > 4, nonsyn == "nonsyn"),
    mapping = aes(label = aavar, vjust = 2, hjust = 0.5)
  ) +
  ggtitle("Number of samples containing each variant - Shared between diet groups") +
  ylab("Segment") +
  xlab("Nucleotide Position") +
  PlotTheme1
print(SharedPlot)
ggsave(filename = "SegmentSNVPlot_DietShared.pdf", plot = SharedPlot, path = savedir, height = 10, width = 9)

Extracting common nonsynonymous variants shared between diet groups

# Nonsynonymous shared variants, most widely shared first, written out as CSV.
nonsyns_shared <- shared %>%
  filter(nonsyn == "nonsyn", n > 1) %>%
  ungroup() %>%
  select(ntvar, aavar, minorfreq, n) %>%
  unique() %>%
  arrange(desc(n))

write.table(nonsyns_shared, file = "nonsyns_shared.csv", sep = ",", row.names = FALSE)

Are there differences in AF between diet-specific and non-diet-specific shared vars?

# Frequency histograms for the diet-shared set and the diet-specific set.
ggplot(data = shared, mapping = aes(x = minorfreq)) +
  geom_histogram()

ggplot(data = transmitted, mapping = aes(x = minorfreq)) +
  geom_histogram()


# Welch t-test comparing the two frequency distributions.
t.test(x = shared$minorfreq, y = transmitted$minorfreq)

    Welch Two Sample t-test

data:  shared$minorfreq and transmitted$minorfreq
t = 0.91334, df = 107.91, p-value = 0.3631
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.007424891  0.020114228
sample estimates:
 mean of x  mean of y 
0.04791309 0.04156843 

Are there differences in allele freq within the shared variants?

# Frequency density per number of ferrets sharing the variant.
# factor(n) derives the groups from the data (same numeric order as before); the
# former hand-written level list silently turned any other n into NA.
# alpha is a constant, so it is set outside aes() rather than mapped.
ggplot(shared, aes(x = minorfreq)) +
  geom_density(aes(group = factor(n), fill = factor(n)), alpha = 0.2)


# Distribution of n across the distinct nonsynonymous shared variants.
select(nonsyns_shared, !minorfreq) %>% unique() %>% ggplot(., aes(x = n)) + geom_histogram(binwidth = 1)

# determining cutoffs for high and low shared
# NOTE(review): rows with n == 5 fall in neither bin (n < 5 / n > 5), so they are
# also absent from all_shared - confirm this gap is intended.
low_shared <- shared %>% filter(n < 5) %>% unique() %>% mutate(cat = "low")
high_shared <- shared %>% filter(n > 5) %>% unique() %>% mutate(cat = "high")
all_shared <- rbind(low_shared, high_shared)

ggplot(data = low_shared, mapping = aes(x = minorfreq)) +
  geom_histogram(binwidth = 0.01)


ggplot(data = high_shared, mapping = aes(x = minorfreq)) +
  geom_histogram(binwidth = 0.01)


ggplot(data = all_shared, mapping = aes(x = minorfreq)) +
  geom_density(mapping = aes(group = cat, fill = cat), alpha = 0.4)


t.test(x = low_shared$minorfreq, y = high_shared$minorfreq)

    Welch Two Sample t-test

data:  low_shared$minorfreq and high_shared$minorfreq
t = 1.9828, df = 265.48, p-value = 0.04842
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 8.386903e-05 2.391033e-02
sample estimates:
 mean of x  mean of y 
0.05023738 0.03824028 

Are there differences in AF between shared and non shared variants?

# Variants that occur in exactly one (ntvar, minorfreq, sample) row.
# NOTE(review): this counts samples, not ferrets - a variant seen on two days in
# one ferret is excluded here; confirm that is intended.
oneferret <- ferunique %>%
  select(ntvar, minorfreq, sample) %>%
  unique() %>%
  count(ntvar) %>%
  filter(n == 1)
oneferret <- unique(oneferret$ntvar)
singles <- unique(filter(ferunique, ntvar %in% oneferret))

non_share <- singles %>%
  select(ntvar, aavar, minorfreq) %>%
  mutate(n = 1, cat = "not shared")

ggplot(data = non_share, mapping = aes(x = minorfreq)) +
  geom_histogram(binwidth = 0.01)

# Relabel every shared row (this replaces the low/high labels).
all_shared$cat <- "shared"

all_shared_sub <- all_shared %>% select(ntvar, aavar, minorfreq, n, cat)

try_all <- unique(rbind(all_shared_sub, non_share))

ggplot(data = try_all, mapping = aes(x = minorfreq)) +
  geom_density(mapping = aes(group = cat, fill = cat), alpha = 0.4)


t.test(x = non_share$minorfreq, y = low_shared$minorfreq)

    Welch Two Sample t-test

data:  non_share$minorfreq and low_shared$minorfreq
t = -3.3303, df = 220.49, p-value = 0.001017
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.029091318 -0.007460915
sample estimates:
 mean of x  mean of y 
0.03196126 0.05023738 
# Welch t-test: non-shared variants vs the high-shared (n > 5) bin.
t.test(non_share$minorfreq, high_shared$minorfreq)

    Welch Two Sample t-test

data:  non_share$minorfreq and high_shared$minorfreq
t = -1.5486, df = 255.65, p-value = 0.1227
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.014263679  0.001705645
sample estimates:
 mean of x  mean of y 
0.03196126 0.03824028 

Combining all shared(btw obese and lean) compared to not shared

# Box plot of frequency by category; outliers are hidden and the y axis is
# limited to 0-0.1.
share_v_noshare_AF <- ggplot(
  data = try_all,
  mapping = aes(x = cat, y = minorfreq, color = cat)
) +
  geom_boxplot(outlier.shape = NA) +
  #geom_jitter(alpha = 0.3) +
  ylim(0, 0.1) +
  PlotTheme1
print(share_v_noshare_AF)
ggsave(filename = "share_v_noshare_AF.pdf", plot = share_v_noshare_AF, path = savedir, height = 5, width = 9)


# Same comparison as a violin plot, without the axis limit.
ggplot(data = try_all, mapping = aes(x = cat, y = minorfreq, color = cat)) +
  geom_violin() +
  PlotTheme1


t.test(x = non_share$minorfreq, y = all_shared$minorfreq)

    Welch Two Sample t-test

data:  non_share$minorfreq and all_shared$minorfreq
t = -3.3153, df = 553.28, p-value = 0.0009755
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.020342423 -0.005205575
sample estimates:
 mean of x  mean of y 
0.03196126 0.04473526 

Is there a difference in freq or abundance between obese v lean ferrets of the 67 non-diet-specific shared vars?

# Frequency
# Frequency of the diet-shared variants, split by the diet of the carrying ferret.
ggplot(data = shared, mapping = aes(x = diet, y = minorfreq)) +
  geom_boxplot(mapping = aes(color = diet)) +
  DietcolScale +
  PlotTheme2

shared_ln <- shared %>% filter(diet == "Lean")
shared_ob <- shared %>% filter(diet == "Obese")
t.test(x = shared_ln$minorfreq, y = shared_ob$minorfreq)

    Welch Two Sample t-test

data:  shared_ln$minorfreq and shared_ob$minorfreq
t = -1.5297, df = 328.34, p-value = 0.1271
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.021343913  0.002670577
sample estimates:
 mean of x  mean of y 
0.04323066 0.05256732 
# Abundance
# Rows per shared variant and diet.
shared_vars <- shared %>% group_by(ntvar, diet) %>% tally()

ggplot(data = shared_vars, mapping = aes(x = ntvar, y = n, fill = diet)) +
  geom_col(position = "dodge") +
  #facet_grid(~inf_route) +
  PlotTheme1 +
  DietcolScale_fill


# Variants whose obese and lean counts differ by more than 2, back in long form.
diff_shared_vars <- shared %>%
  group_by(ntvar, diet) %>%
  tally() %>%
  pivot_wider(names_from = diet, values_from = n) %>%
  mutate(diff = abs(Obese - Lean)) %>%
  filter(diff > 2) %>%
  pivot_longer(cols = c("Lean", "Obese"), names_to = "diet")

ggplot(data = diff_shared_vars, mapping = aes(x = ntvar, y = value, fill = diet)) +
  geom_col(position = "dodge") +
  #facet_grid(~inf_route) +
  PlotTheme1 +
  DietcolScale_fill

# Is there a difference in AF of the variants found in obese and lean ferrets?

ggplot(data = shared, mapping = aes(x = minorfreq, fill = diet)) +
  geom_histogram(binwidth = 0.01) +
  PlotTheme1 +
  facet_grid(inf_route ~ diet) +
  DietcolScale_fill


# Index ferrets only: obese vs lean frequencies of the shared variants.
o <- shared %>% filter(inf_route == "Index", diet == "Obese")
l <- shared %>% filter(inf_route == "Index", diet == "Lean")
t.test(x = o$minorfreq, y = l$minorfreq)

    Welch Two Sample t-test

data:  o$minorfreq and l$minorfreq
t = 1.1423, df = 156.25, p-value = 0.2551
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.003206157  0.011999126
sample estimates:
 mean of x  mean of y 
0.03223299 0.02783651 
#not significantly different
# All de novo variants in index ferrets, obese vs lean.
obese_index <- ferunique %>% filter(diet == "Obese", inf_route == "Index") %>% ungroup()
lean_index <- ferunique %>% filter(diet == "Lean", inf_route == "Index") %>% ungroup()
t.test(x = obese_index$minorfreq, y = lean_index$minorfreq)

    Welch Two Sample t-test

data:  obese_index$minorfreq and lean_index$minorfreq
t = 1.3069, df = 401.02, p-value = 0.192
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.001593520  0.007914183
sample estimates:
 mean of x  mean of y 
0.03037901 0.02721868 
# means are not different

# All de novo variants in contact ferrets, obese vs lean.
obese_contact <- ferunique %>% filter(diet == "Obese", inf_route == "Contact") %>% ungroup()
lean_contact <- ferunique %>% filter(diet == "Lean", inf_route == "Contact") %>% ungroup()
t.test(x = obese_contact$minorfreq, y = lean_contact$minorfreq)

    Welch Two Sample t-test

data:  obese_contact$minorfreq and lean_contact$minorfreq
t = 0.91384, df = 328.9, p-value = 0.3615
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.008520561  0.023304378
sample estimates:
 mean of x  mean of y 
0.06523254 0.05784063 
# means are not different

# QQ plot: compares the quantiles of two distributions; points on x = y suggest
# both are drawn from the same distribution.
qqnorm(y = obese_index$minorfreq, main = "Obese Index - Test of Normal Distribution")

qqnorm(y = lean_index$minorfreq, main = "Lean Index - Test of Normal Distribution")

# neither distribution is normal
qqplot(x = obese_index$minorfreq, y = lean_index$minorfreq, xlab = "Obese Index", ylab = "Lean Index")


qqnorm(y = obese_contact$minorfreq, main = "Obese Contact - Test of Normal Distribution")

qqnorm(y = lean_contact$minorfreq, main = "Lean Contact - Test of Normal Distribution")

# neither distribution is normal
qqplot(x = obese_contact$minorfreq, y = lean_contact$minorfreq, xlab = "Obese Contact", ylab = "Lean Contact")


# Mann-Whitney-Wilcoxon test (Mann-Whitney U test): used because the samples are
# not normally distributed and are independent of each other.
wilcox.test(x = obese_index$minorfreq, y = lean_index$minorfreq)

    Wilcoxon rank sum test with continuity correction

data:  obese_index$minorfreq and lean_index$minorfreq
W = 44444, p-value = 0.07889
alternative hypothesis: true location shift is not equal to 0
# Same rank-sum test for the contact ferrets, obese vs lean.
wilcox.test(obese_contact$minorfreq,lean_contact$minorfreq)

    Wilcoxon rank sum test with continuity correction

data:  obese_contact$minorfreq and lean_contact$minorfreq
W = 17434, p-value = 0.04737
alternative hypothesis: true location shift is not equal to 0
# index distributions are not different (p = 0.079); contact distributions differ at the 0.05 level (p = 0.047)

# Kolmogorov-Smirnov test: used because the samples are not normally distributed
# and are independent of each other.
# "sensitive to differences in location and shape of the empirical CDFs of the two samples"
ks.test(x = obese_index$minorfreq, y = lean_index$minorfreq)

    Asymptotic two-sample Kolmogorov-Smirnov test

data:  obese_index$minorfreq and lean_index$minorfreq
D = 0.091481, p-value = 0.1993
alternative hypothesis: two-sided
# Same two-sample KS test for the contact ferrets, obese vs lean.
ks.test(obese_contact$minorfreq,lean_contact$minorfreq)

    Asymptotic two-sample Kolmogorov-Smirnov test

data:  obese_contact$minorfreq and lean_contact$minorfreq
D = 0.16995, p-value = 0.01315
alternative hypothesis: two-sided
# index distributions are not different (p = 0.199); contact distributions differ at the 0.05 level (p = 0.013)
---
title: "R Notebook"
output: html_notebook
---

```{r}
library("tidyr")
library('ggplot2')
library('dplyr')
library("glue")
library('ggVennDiagram')

# Analysis root; the relative paths read in later chunks resolve against it.
wkdir = "~/Desktop/GitHub/Obesity/NewExtractions/H9N2/timo_0.01"
# NOTE(review): setwd() ties the notebook to this machine's directory layout.
setwd(wkdir)
# Output directory for saved figures.
savedir = "~/Desktop/GitHub/Obesity/NewExtractions/H9N2/timo_0.01/Output_Figures"

# Project helpers; presumably the source of CoverageAcross() and ArrangeVarWRep(),
# which are not defined in this file - TODO confirm.
source("~/Desktop/GitHub/Obesity/NewExtractions/H9N2/FD_functions.R")
```

```{r}
# Diet groups and the colour assigned to each one; naming the vector lets the
# manual scales match colours to diet values by name.
diet <- c("Obese", "Lean", "Control")
dietColors <- setNames(c("#FF9933", "#66CCFF", "#606060"), diet)
# Shared manual scales (legend title "grp") for the fill and colour aesthetics.
DietcolScale_fill <- scale_fill_manual(name = "grp", values = dietColors)
DietcolScale <- scale_colour_manual(name = "grp", values = dietColors)
```

# Specifying thresholds and plotting variables
```{r}
# Coverage cutoff handed to CoverageAcross().
cov_cut <- 200
# Minimum minor-allele frequency each replicate must reach.
freq_cut <- 0.01
pvalcut <- 0.05

# Valid nucleotides and the segment order used for factor levels.
ntlist <- c("A", "C", "G", "T")
SEGMENTS <- c(
  "H9N2_PB2", "H9N2_PB1", "H9N2_PA", "H9N2_HA",
  "H9N2_NP", "H9N2_NA", "H9N2_MP", "H9N2_NS"
)
```

# Loading metadata
This includes titer and Ct values when applicable. ND indicates qPCR was run with a negative result; 0 indicates plaque assay or HAI was run with a negative result. NA for any value indicates that data were missing. Sacrificed indicates there was no data at that time point because the ferret had already been sacrificed for pathology.
```{r}
# Path to the per-sample metadata table
metafile <- "~/Desktop/GitHub/Obesity/NewExtractions/H9N2/H9_Metadata.csv"

# Empty cells are read as NA; only resequenced samples are kept
meta <- read.csv(file = metafile, header = TRUE, sep = ",", na.strings = c(""))
meta <- filter(meta, resequenced == "yes")

# These columns can also hold text codes (e.g. "ND", "Sacrificed"), so
# as.numeric() turns those entries into NA and warns
# "NAs introduced by coercion"; that warning is expected here.
meta$Ct_Mgene <- as.numeric(meta$Ct_Mgene)
meta$titer <- as.numeric(meta$titer)
meta$log10_titer <- as.numeric(meta$log10_titer)

# Fix the display order of infection routes
meta$inf_route <- factor(
  meta$inf_route,
  levels = c("Index", "Contact", "Aerosol", "Control")
)
```

# Loading in coverage file & segment size information
```{r}
# Average coverage table
cov <- read.csv("./avg_coverage/H9N2.coverage.csv", header = TRUE, sep = ",")

# Segment lengths; the row labelled H9N2_GENOME supplies the genome size
seg_sizes <- "../SegmentSize.csv"
sizes <- read.csv(seg_sizes, header = TRUE, sep = ",", na.strings = "")
GenomeSize <- filter(sizes, segment == "H9N2_GENOME")$SegmentSize

# Order segments as listed in SEGMENTS
cov$segment <- factor(cov$segment, levels = SEGMENTS)
```

# Checking if data passes thresholds
```{r}
# CoverageAcross() is not defined in this file (presumably it comes from the
# sourced FD_functions.R). It receives the coverage table, the coverage cutoff,
# a second threshold (40), the segment sizes and the working directory.
# NOTE(review): the meaning of 40 is not visible here - confirm in FD_functions.R.
cov_check = CoverageAcross(cov,cov_cut,40,sizes, wkdir)
```

# Merging coverage check info with the rest of the metadata
```{r}
# Join metadata (keyed by `sample`) to the coverage check (keyed by `name`).
# all.y = TRUE keeps every cov_check row, so a sample without metadata is
# retained with NA in the metadata columns.
meta = merge(meta, cov_check, by.x = c("sample"), by.y = c("name"), all.y = TRUE)

# Row count and a tally of the `quality` values after the merge
nrow(meta)
count(meta,quality)
```

# Loading in variant files
```{r}
varfile <- "./varfiles/H9N2.VariantsOnly.0.01.200.csv"

# Load the variant calls (empty strings become NA) and copy the sample
# identifier into a column called `name`
vars <- read.csv(varfile, header = TRUE, sep = ",", na.strings = "")
vars$name <- vars$sample
```

# Rearranging variant dataframe
```{r}
# ArrangeVarWRep() is not defined in this file (presumably it comes from the
# sourced FD_functions.R); its output is the working variant table `vdf`.
vdf = ArrangeVarWRep(vars)
# already have replicate data in the varfiles from running CompareReps.v2.py script
# Remove exact duplicate rows and unused factor levels, then report the row count
vdf = vdf[!duplicated(vdf), ] %>% droplevels()
nrow(vdf)
```

# Filtering variant df with frequency cutoffs
```{r}
# Keep a variant only when BOTH replicates reach freq_cut (each replicate is
# checked separately, not their average) and both the major and minor alleles
# are plain nucleotides.
# Original note: based on the MAF study, replicates plus this cutoff were the
# best combination.
# NOTE(review): the original note called this a "0.01%" cutoff, but
# freq_cut = 0.01 - confirm whether frequencies are proportions (i.e. 1%).
vdf <- vdf %>%
  filter(
    minorfreq1 >= freq_cut,
    minorfreq2 >= freq_cut,
    minor %in% ntlist,
    major %in% ntlist
  ) %>%
  droplevels()

# De-duplicate again; per the original note this does not eliminate any
# variants here
vdf <- droplevels(vdf[!duplicated(vdf), ])
nrow(vdf)
```

# Filtering variant df by timo binocheck
```{r}
#vdf$binocheck = factor(vdf$binocheck, levels = c("False","R1","R2","True"))
#vdf = filter(vdf, binocheck != "False") %>% unique()
#nrow(vdf)

# binocheck is highly dependent on the allele frequency threshold used and also relatively conservative
# as a result, ignore this in favor of found in both replicates across ferrets and cohorts - this is more indicative of a real variant than binocheck
```

# Adding metadata
```{r}
# Attach sample/segment-level metadata to every variant row
vdf <- merge(vdf, meta, by = c("sample", "segment"))
vdf <- droplevels(vdf[!duplicated(vdf), ])

vdf$segment <- factor(vdf$segment, levels = SEGMENTS)

# Aerosol samples are ignored for now
vdf <- vdf %>%
  filter(inf_route %in% c("Index", "Contact", "Control"))

# Ferret 2232 is a contact that later served as the index for another contact.
# Drop its index records so it is counted once, as a contact only.
vdf <- vdf %>%
  filter(!(ferretID == 2232 & inf_route == "Index"))
```

```{r}
# Restrict to samples whose quality is "good"
vdf <- vdf %>% filter(quality == "good")
vdf <- droplevels(vdf[!duplicated(vdf), ])

# Sorted unique names of the samples that remain
good_names <- levels(factor(vdf$sample))
```

```{r}
# Transmission-pair table. The path is written relative to the home directory,
# like every other input path in this notebook, instead of being tied to one
# user account.
transmission_info <- "~/Desktop/GitHub/Obesity/NewExtractions/H9N2/TransmissionPairs.csv"
pairs <- read.csv(transmission_info, header = TRUE)
```

```{r}
# Rows where the sample's major allele differs from the stock nucleotide,
# i.e. positions whose consensus changed
con_change <- vdf %>%
  filter(stocknt != major) %>%
  filter(major %in% ntlist)
con_change <- con_change[!duplicated(con_change), ]

# Position key "<segment>_<ntpos>"
con_change$ntvarpos <- paste0(con_change$segment, "_", con_change$ntpos)

# Read the key by its full name. The previous `$ntvar` only resolved to this
# column through `$` partial matching, and would silently pick up a different
# key format if an `ntvar` column were present. The next chunk matches these
# values against `ntvarpos`.
consensus <- unique(con_change$ntvarpos)
length(consensus)
```

```{r}
# Position key "<segment>_<ntpos>" for every variant row
vdf$ntvarpos = paste0(vdf$segment,"_",vdf$ntpos)

# Keep only rows whose position key is not listed in `consensus`
minorvdf = filter(vdf, !(ntvarpos %in% consensus)) %>% unique()
# Number of rows dropped by the filter and unique() above
nrow(vdf) - nrow(minorvdf)
```

SNV location plots
```{r}
# One point per variant: nucleotide position against ferret, coloured by diet
# and shaped by cohort, with a panel per infection route and segment
SNVLocation <- ggplot(vdf, aes(x = ntpos, y = ferretID)) +
  geom_point(aes(color = diet, shape = cohort)) +
  facet_grid(inf_route ~ segment) +
  PlotTheme1 +
  DietcolScale
print(SNVLocation)
ggsave(filename = "SNVLocation.pdf", plot = SNVLocation, path = savedir)
# Open question from the original: ferret 1787 doesn't have any variants??
```

```{r}
# Compare ferret SNVs with those found in each cohort's stock.
# Consensus variants are included (changed 9/14/23).

# Stock rows and their position keys, one pair of objects per cohort
F17_stock <- vdf %>% filter(DPI == "Stock" & cohort == "F17")
W17_stock <- vdf %>% filter(DPI == "Stock" & cohort == "W17")
Sm18_stock <- vdf %>% filter(DPI == "Stock" & cohort == "Sm18")
Sp19_stock <- vdf %>% filter(DPI == "Stock" & cohort == "Sp19")
Sp20_stock <- vdf %>% filter(DPI == "Stock" & cohort == "Sp20")

F17_stock_ntvarpos <- unique(F17_stock$ntvarpos)
W17_stock_ntvarpos <- unique(W17_stock$ntvarpos)
Sm18_stock_ntvarpos <- unique(Sm18_stock$ntvarpos)
Sp19_stock_ntvarpos <- unique(Sp19_stock$ntvarpos)
Sp20_stock_ntvarpos <- unique(Sp20_stock$ntvarpos)

# Rows of each cohort whose infection route is not "Control", and their keys
F17_ferret <- vdf %>% filter(cohort == "F17" & inf_route != "Control")
W17_ferret <- vdf %>% filter(cohort == "W17" & inf_route != "Control")
Sm18_ferret <- vdf %>% filter(cohort == "Sm18" & inf_route != "Control")
Sp19_ferret <- vdf %>% filter(cohort == "Sp19" & inf_route != "Control")
Sp20_ferret <- vdf %>% filter(cohort == "Sp20" & inf_route != "Control")

F17_ferret_ntvarpos <- unique(F17_ferret$ntvarpos)
W17_ferret_ntvarpos <- unique(W17_ferret$ntvarpos)
Sm18_ferret_ntvarpos <- unique(Sm18_ferret$ntvarpos)
Sp19_ferret_ntvarpos <- unique(Sp19_ferret$ntvarpos)
Sp20_ferret_ntvarpos <- unique(Sp20_ferret$ntvarpos)

# Export the stock variants of all cohorts as a single table
all_stock_var <- select(
  rbind(F17_stock, W17_stock, Sm18_stock, Sp19_stock, Sp20_stock),
  sample, ntvarpos, aapos, major, majoraa, minor, minoraa, minorfreq, nonsyn
)
write.csv(all_stock_var, "all_stock_var.csv", row.names = FALSE)
```

```{r}
# Split each cohort's ferret variants by whether their position key also
# occurs in that cohort's stock:
#   *_shared - present in the stock
#   *_denovo - absent from the stock
# The original also filtered on `ntvarpos %in% <cohort>_ferret_ntvarpos`. That
# vector is built from the same ferret table, so the test is always TRUE and is
# left out here without changing the result.
F17_shared <- unique(filter(F17_ferret, ntvarpos %in% F17_stock_ntvarpos))
F17_denovo <- unique(filter(F17_ferret, !(ntvarpos %in% F17_stock_ntvarpos)))

W17_shared <- unique(filter(W17_ferret, ntvarpos %in% W17_stock_ntvarpos))
W17_denovo <- unique(filter(W17_ferret, !(ntvarpos %in% W17_stock_ntvarpos)))

Sm18_shared <- unique(filter(Sm18_ferret, ntvarpos %in% Sm18_stock_ntvarpos))
Sm18_denovo <- unique(filter(Sm18_ferret, !(ntvarpos %in% Sm18_stock_ntvarpos)))

Sp19_shared <- unique(filter(Sp19_ferret, ntvarpos %in% Sp19_stock_ntvarpos))
Sp19_denovo <- unique(filter(Sp19_ferret, !(ntvarpos %in% Sp19_stock_ntvarpos)))

Sp20_shared <- unique(filter(Sp20_ferret, ntvarpos %in% Sp20_stock_ntvarpos))
Sp20_denovo <- unique(filter(Sp20_ferret, !(ntvarpos %in% Sp20_stock_ntvarpos)))
```

```{r}
# Pool the per-cohort tables of ferret variants that are also in the stock
stock_shared = rbind(F17_shared, W17_shared, Sm18_shared, Sp19_shared, Sp20_shared) %>% unique()
# Amino-acid label: major residue, position, minor residue
stock_shared$aavar = paste0(stock_shared$majoraa,stock_shared$aapos,stock_shared$minoraa)

# Pool the per-cohort tables of ferret variants that are NOT in the stock
ferunique = rbind(F17_denovo, W17_denovo, Sm18_denovo, Sp19_denovo, Sp20_denovo) %>% unique
ferunique$aavar = paste0(ferunique$majoraa,ferunique$aapos,ferunique$minoraa)
# Written to the current working directory (row names are included)
write.csv(ferunique, "AllVarNotInStock.csv")

# Per-ferret ratio of nonsynonymous to synonymous variant counts:
# count rows per ferret and nonsyn class, spread the classes into columns,
# divide, then attach the mean ratio of each diet x infection-route group.
dNdS_ferunique = group_by(ferunique, diet, inf_route,ferretID, nonsyn) %>% tally() %>% 
  filter(!(is.na(nonsyn))) %>% 
  pivot_wider(id_cols = c(ferretID, diet,inf_route), names_from = nonsyn, values_from = n) %>%
  mutate(dNdS_per_ferret = nonsyn/syn) %>%
  filter(!(is.na(dNdS_per_ferret))) %>% # ferrets with 0 nonsyn or 0 syn will produce a dNdS = NA, remove these
  group_by(diet,inf_route) %>% mutate(avg_per_diet_infroute = mean(dNdS_per_ferret))

# Welch t-test of the per-ferret ratios, lean contacts versus obese contacts
dNdS_ln = filter(dNdS_ferunique, diet == "Lean" & inf_route == "Contact") %>% ungroup()
dNdS_ob = filter(dNdS_ferunique, diet == "Obese" & inf_route == "Contact") %>% ungroup()
t.test(dNdS_ln$dNdS_per_ferret, dNdS_ob$dNdS_per_ferret)

# Box plot with a semi-transparent violin overlay, one panel per infection route
ggplot(dNdS_ferunique, aes(y = dNdS_per_ferret, x = diet)) +
  geom_boxplot(aes(fill = diet)) +
  geom_violin(alpha = 0.5, aes(fill = diet)) +
  facet_grid(~inf_route) +
  DietcolScale_fill +
  PlotTheme1
```

SNV Location compared to stock
```{r}
# Positions of ferret SNVs that are also present in the matching stock
StockSharedPlot <- ggplot(stock_shared, aes(x = ntpos, y = ferretID)) +
  geom_point(aes(color = diet, shape = cohort), size = 2) +
  facet_grid(inf_route ~ segment, drop = FALSE) +
  PlotTheme1 +
  DietcolScale +
  ggtitle("SNVs found in stock")
print(StockSharedPlot)
ggsave(
  filename = "StockSharedPlot.pdf",
  plot = StockSharedPlot,
  path = savedir,
  width = 15,
  height = 30
)

# Positions of ferret SNVs that are absent from the matching stock
FerUniquePlot <- ggplot(ferunique, aes(x = ntpos, y = ferretID)) +
  geom_point(aes(color = diet)) +
  facet_grid(inf_route ~ segment) +
  PlotTheme1 +
  DietcolScale +
  ggtitle("SNVs not found in stock")
print(FerUniquePlot)
ggsave(filename = "FerUniquePlot.pdf", plot = FerUniquePlot, path = savedir)
```

# Shared de novo SNVs
```{r}
# Full variant label "<segment>_<major><ntpos><minor>"
ferunique$ntvar = paste0(ferunique$segment,"_",ferunique$major,ferunique$ntpos,ferunique$minor)

# Number of ferrets carrying each variant; keep variants seen in more than one.
# The first tally() collapses to one row per (variant, ferret), the second
# counts those rows per variant.
# NOTE(review): this relies on tally() counting rows; confirm the installed
# dplyr does not weight the second tally() by the existing `n` column.
shared_vars = ferunique %>% 
  group_by(ntvar,ferretID) %>% 
  tally() %>%
  group_by(ntvar) %>% # This is to prevent double counting variants within a same ferret but different dpi
  tally() %>% filter(n > 1) %>% unique()

# 100 variants that are shared between any ferrets
shared_denovominors = unique(shared_vars$ntvar)
# All ferunique rows that belong to those shared variants
df_allshared = filter(ferunique, ntvar %in% shared_denovominors)

```

# Obese- and lean-specific SNVs
```{r}
# Variant labels observed in obese and in lean ferrets among the shared
# de novo variants
o_var <- unique(filter(df_allshared, diet == "Obese")$ntvar)
l_var <- unique(filter(df_allshared, diet == "Lean")$ntvar)

diet_var <- list(Obese = o_var, Lean = l_var)

# Venn diagram of obese and lean de novo shared SNVs
DietUniqueSNVS <- ggVennDiagram(diet_var)
print(DietUniqueSNVS)
ggsave(filename = "DietUniqueSNVS.pdf", plot = DietUniqueSNVS, path = savedir)

# Recorded result: 100 total - 67 shared between obese and lean,
# 20 obese specific, 13 lean specific
```

```{r}
# Shared variants found in lean ferrets only (in l_var but not in o_var)
lean = df_allshared %>%
  filter(ntvar %in% l_var) %>%
  filter(!(ntvar %in% o_var)) %>% 
  unique()
unique(lean$ntvar)

# Combined ferret + variant key
lean$ferretID_var = paste0(lean$ferretID,"_",lean$ntvar)

# Number of lean ferrets carrying each lean-only variant (column `n`)
repeats_lean = lean %>% 
  group_by(ntvar,ferretID) %>% 
  tally() %>%
  group_by(ntvar) %>% # This is to prevent double counting variants within a same ferret but different dpi
  tally() %>% unique()

# Attach the per-variant ferret count to every row
lean = merge(lean, repeats_lean, by = c("ntvar")) %>% unique()

# Same steps for variants found in obese ferrets only (in o_var but not in l_var)
obese = df_allshared %>% 
  filter(ntvar %in% o_var) %>% 
  filter(!(ntvar %in% l_var)) %>%
  unique()
unique(obese$ntvar)

obese$ferretID_var = paste0(obese$ferretID,"_",obese$ntvar)

repeats_obese = obese %>% 
  group_by(ntvar,ferretID) %>% 
  tally() %>%
  group_by(ntvar) %>% # This is to prevent double counting variants within a same ferret but different dpi
  tally() %>%
  unique()

obese = merge(obese, repeats_obese, by = c("ntvar")) %>% unique()

# Stack both diet-specific tables and rename the count column n -> ferret_num
dietunique = rbind(lean,obese) %>% unique()
dietunique$ferret_num = dietunique$n
dietunique = select(dietunique, !(n))
```

```{r}
# Manual annotation for three MP variants: their synonymous status and
# amino-acid labels were looked up by hand and are written over the
# `nonsyn` and `aavar` columns of the matching rows.
MP_G459A = filter(dietunique, ntvar == "H9N2_MP_G459A") %>% unique()
MP_G459A$nonsyn = "syn"
MP_G459A$aavar = "Q153Q"
MP_T444C = filter(dietunique, ntvar == "H9N2_MP_T444C") %>% unique()
MP_T444C$nonsyn = "syn"
MP_T444C$aavar = "C148C"
MP_G339A = filter(dietunique, ntvar == "H9N2_MP_G339A") %>% unique()
MP_G339A$nonsyn = "syn"
MP_G339A$aavar = "K113K"

# Rebuild dietunique: all other rows followed by the three corrected subsets
# (this moves the corrected rows to the end of the table)
MPs = c("H9N2_MP_G459A","H9N2_MP_T444C","H9N2_MP_G339A")
rest = filter(dietunique, !(ntvar %in% MPs)) %>% unique()
dietunique = rbind(rest, MP_G459A,MP_T444C,MP_G339A)
```

# AF and emergence of obese-specific variants
```{r}
# What is the AF distribution of obese-specific variants?
# Subset used throughout: obese, nonsynonymous, carried by exactly two ferrets.
ggplot(filter(dietunique, diet == "Obese" & nonsyn == "nonsyn" & ferret_num == 2), aes(x = minorfreq)) +
  geom_histogram(binwidth = 0.01) +
  PlotTheme1

# Same subset, minor allele frequency by infection route
ggplot(filter(dietunique, diet == "Obese" & nonsyn == "nonsyn" & ferret_num == 2), aes(x = inf_route, y = minorfreq)) +
  geom_boxplot() +
  #facet_grid(~inf_route) +
  PlotTheme1

# Obese adaptation -> higher AF than non shared?
# Welch t-test of minor allele frequency, index versus contact, in that subset
o_in = filter(dietunique, diet == "Obese" & nonsyn == "nonsyn" & ferret_num == 2 & inf_route == "Index")
o_co = filter(dietunique, diet == "Obese" & nonsyn == "nonsyn" & ferret_num == 2 & inf_route == "Contact")
t.test(o_in$minorfreq, o_co$minorfreq)

# Diet adaptation (lean and obese) -> higher AF than non shared?
# `ind` is only built here; the comparison against non_share is commented out
ind = filter(dietunique, nonsyn == "nonsyn" & ferret_num == 2 & inf_route == "Index")
#t.test(ind$minorfreq,non_share$minorfreq)
#
```

```{r}
# Do they persist?
# Unlike `lean`/`obese` above, these tables start from ferunique rather than
# df_allshared, restricted to the lean-only / obese-only variant labels.
lean2 = ferunique %>% 
  filter(ntvar %in% l_var) %>% 
  filter(!(ntvar %in% o_var)) %>% 
  unique()
lean2$ferretID_var = paste0(lean2$ferretID,"_",lean2$ntvar)

# day_num = number of rows for each (variant, ferret) combination
repeats_lean2 = lean2 %>% 
  mutate(count = 1) %>%
  group_by(ntvar,ferretID) %>% mutate(day_num = sum(count)) %>% ungroup()

# fer_num = number of distinct ferrets carrying each variant
lean_fers = select(repeats_lean2, ntvar, ferretID) %>% unique() %>% group_by(ntvar) %>% tally()
lean_fers$fer_num = lean_fers$n
lean_fers = select(lean_fers, !(n))
lean_wrep = merge(repeats_lean2, lean_fers, by = "ntvar") %>% unique()

####

# Same construction for the obese-only variants
obese2 = ferunique %>% 
  filter(ntvar %in% o_var) %>% 
  filter(!(ntvar %in% l_var)) %>%
  unique()
obese2$ferretID_var = paste0(obese2$ferretID,"_",obese2$ntvar)

repeats_obese2 = obese2 %>% 
  mutate(count = 1) %>%
  group_by(ntvar,ferretID) %>% mutate(day_num = sum(count)) %>% ungroup() 
ob_fers = select(repeats_obese2, ntvar, ferretID) %>% unique() %>% group_by(ntvar) %>% tally()
ob_fers$fer_num = ob_fers$n
ob_fers = select(ob_fers, !(n))
obese_wrep = merge(repeats_obese2, ob_fers, by = "ntvar") %>% unique()

# Both diets stacked, carrying day_num and fer_num on every row
dietunique_repeats = rbind(obese_wrep,lean_wrep) %>% unique()
```

```{r}
# Good-quality samples, full-joined onto the variant table so that samples
# without any of these variants also get a row.
# NOTE(review): dietunique_repeats_zeros is built here, but the plot below uses
# dietunique_repeats - confirm which table was intended.
meta_good = filter(meta, quality == "good") %>% select(sample, quality)
dietunique_repeats_zeros = merge(dietunique_repeats, meta_good, all = TRUE) %>% unique()
# Frequency over DPI of nonsynonymous diet-specific variants carried by exactly
# two ferrets; one panel per ferret, one line per variant
persistence = ggplot(filter(dietunique_repeats, nonsyn == "nonsyn" & fer_num == 2), aes(x = DPI, y = minorfreq)) +
  geom_point(aes(color = ntvar)) +
  geom_line(aes(group = ntvar)) +
  facet_grid(~ferretID) +
  PlotTheme1
print(persistence)
ggsave(persistence, filename = "persistence.pdf", path = savedir, width = 25, height = 5)
```

```{r}
# Emergence
# For obese, nonsynonymous variants carried by exactly two ferrets:
#   perday    = rows per infection route and DPI
#   pergroup  = rows per infection route
#   day_ratio = perday / pergroup
# Note: the result is still grouped by inf_route (no ungroup()).
timing = filter(dietunique, diet == "Obese" & nonsyn == "nonsyn" & ferret_num == 2) %>%
  mutate(count = 1) %>% 
  group_by(inf_route, DPI) %>%
  mutate(perday = sum(count)) %>%
  group_by(inf_route) %>% 
  mutate(pergroup = sum(count)) %>%
  mutate(day_ratio = perday / pergroup) %>%
  select(DPI,inf_route, perday,pergroup, day_ratio) %>% unique()

# Share of detections per DPI, one panel per infection route
ggplot(timing, aes(x = DPI, y = day_ratio)) +
  geom_col() +
  facet_grid(~inf_route) +
  PlotTheme1

# Same calculation for both diets, grouped additionally by diet
timing_bydiet = filter(dietunique,nonsyn == "nonsyn" & ferret_num == 2) %>%
  mutate(count = 1) %>% 
  group_by(diet,inf_route, DPI) %>%
  mutate(perday = sum(count)) %>%
  group_by(diet,inf_route) %>% 
  mutate(pergroup = sum(count)) %>%
  mutate(day_ratio = perday / pergroup) %>%
  select(DPI,diet,inf_route, perday,pergroup, day_ratio) %>% unique()

ggplot(timing_bydiet, aes(x = DPI, y = day_ratio)) +
  geom_col() +
  facet_grid(diet~inf_route) +
  PlotTheme1
```

# Determining if diet-unique shared variants are transmitted
```{r}
# Attach transmission-pair information to every diet-specific variant row
dietunique_pairs <- merge(dietunique, pairs, by = c("ferretID"))

# Only variants carried by more than one ferret can have been transmitted
shared <- filter(dietunique_pairs, ferret_num > 1)

# For each variant, count the distinct ferrets carrying it within each
# transmission pair. Counting distinct ferrets avoids double counting a variant
# seen at several DPI in the same ferret.
#   "2" - both ferrets of the pair carry the variant (possible transmission)
#   "1" - the carriers belong to different transmission pairs
# This replaces a per-variant loop that grew `transmitted` with rbind(),
# stored the variant list in a variable named `t` (masking base::t), and
# failed when no variant qualified.
transmitted <- shared %>%
  group_by(ntvar, pair_numbers) %>%
  mutate(transmission = as.character(n_distinct(ferretID))) %>%
  ungroup() %>%
  as.data.frame()
```

# Removing possible transmissions since they would not be considered recurrent 
```{r}
# Variants whose carriers are in different transmission pairs
recurrent <- filter(transmitted, transmission == "1")
# Number of distinct recurrent variants per diet
select(recurrent, ntvar, diet) %>% unique() %>% count(diet)
# Nonsynonymous / synonymous row-count ratio per diet
recurrent_dNdS <- group_by(recurrent, diet, nonsyn) %>% tally() %>%
  pivot_wider(names_from = nonsyn, values_from = n) %>%
  mutate(dNdS = nonsyn/syn)

# Position of each recurrent variant by segment, one panel per diet.
# Segments are drawn in reverse SEGMENTS order so PB2 sits at the top.
# Point size and label offsets are constants, so they are set outside aes();
# inside aes() the size would be treated as data and add a spurious legend.
DietUnique_Recurrent <- ggplot(
  recurrent,
  aes(x = ntpos, y = factor(segment, levels = rev(SEGMENTS)))
) +
  geom_point(aes(color = nonsyn), size = 2) +
  geom_text(
    data = filter(recurrent, ferret_num == 2, nonsyn == "nonsyn"),
    aes(label = aavar),
    vjust = 2,
    hjust = 0.5
  ) +
  xlab("Nucleotide position") +
  ylab("Segment") +
  facet_grid(~diet) +
  PlotTheme1
print(DietUnique_Recurrent)
ggsave(
  filename = "DietUnique_Recurrent.pdf",
  plot = DietUnique_Recurrent,
  width = 10,
  height = 5,
  path = savedir
)
```
# Looking at transmitted minor vars
```{r}
# Variants carried by both ferrets of one transmission pair
trans_var <- filter(transmitted, transmission == "2")
# Number of distinct such variants per diet
select(trans_var, ntvar, diet) %>% unique() %>% count(diet)

# Position of each possibly transmitted variant by segment, with one row of
# panels per transmission pair and one column per infection route.
# Point size and label offsets are constants, so they are set outside aes();
# inside aes() the size would be treated as data and add a spurious legend.
transmitted_denovo_minors <- ggplot(
  trans_var,
  aes(x = ntpos, y = factor(segment, levels = rev(SEGMENTS)))
) +
  geom_point(aes(color = nonsyn), size = 2) +
  geom_text(
    data = filter(trans_var, ferret_num == 2),
    aes(label = aavar),
    vjust = 2,
    hjust = 0.5
  ) +
  xlab("Nucleotide position") +
  ylab("Segment") +
  facet_grid(pair_numbers + pair_diets ~ inf_route) +
  PlotTheme2
print(transmitted_denovo_minors)
ggsave(
  filename = "DietUnique_Transmitted.pdf",
  plot = transmitted_denovo_minors,
  width = 7,
  height = 5,
  path = savedir
)
```

# SNVs shared between diet groups
```{r}
# Shared de novo variants present in BOTH diet groups
shared <- unique(filter(df_allshared, ntvar %in% o_var, ntvar %in% l_var))
shared$ferretID_var <- paste(shared$ferretID, shared$ntvar, sep = "_")

# Ferrets per variant: collapse to one row per (variant, ferret) first so a
# variant seen on several days in one ferret is counted once
repeats_shared <- shared %>%
  group_by(ntvar, ferretID) %>%
  tally() %>%
  group_by(ntvar) %>%
  tally()

shared <- unique(merge(shared, repeats_shared, by = c("ntvar")))

# Variant position by segment (reverse SEGMENTS order, PB2 at the top);
# point size shows the count, labels mark nonsynonymous variants with n > 4
SharedPlot <- ggplot(
  shared,
  aes(x = ntpos, y = factor(segment, levels = rev(SEGMENTS)))
) +
  geom_point(aes(size = n, color = nonsyn)) +
  geom_text(
    data = filter(shared, n > 4, nonsyn == "nonsyn"),
    aes(label = aavar, vjust = 2, hjust = 0.5)
  ) +
  ggtitle("Number of samples containing each variant - Shared between diet groups") +
  ylab("Segment") +
  xlab("Nucleotide Position") +
  PlotTheme1
print(SharedPlot)
ggsave(
  filename = "SegmentSNVPlot_DietShared.pdf",
  plot = SharedPlot,
  path = savedir,
  height = 10,
  width = 9
)
```

# Extracting common nonsynonymous variants shared between diet groups
```{r}
# Nonsynonymous variants shared between diet groups, most widespread first
nonsyns_shared <- shared %>%
  filter(nonsyn == "nonsyn", n > 1) %>%
  ungroup() %>%
  select(ntvar, aavar, minorfreq, n) %>%
  unique() %>%
  arrange(desc(n))

# Comma-separated export without row names
write.table(
  nonsyns_shared,
  file = "nonsyns_shared.csv",
  sep = ",",
  row.names = FALSE
)
```

# Are there differences in AF between diet-specific and non-diet-specific shared vars?
```{r}
# Minor allele frequency histograms (default binning):
# `shared` holds the variants found in both diet groups, `transmitted` the
# diet-specific variants carried by more than one ferret
ggplot(shared, aes(x = minorfreq)) +
  geom_histogram()
ggplot(transmitted, aes(x = minorfreq)) +
  geom_histogram()

# Welch t-test comparing the mean minor allele frequency of the two tables
t.test(shared$minorfreq, transmitted$minorfreq)
```

# Are there differences in allele freq within the shared variants?
```{r}
# Density of minor allele frequency, one curve per count `n`.
# factor(n) derives the levels from the data (sorted numerically), so counts
# outside a hand-written level list are no longer turned into NA. The constant
# transparency is set outside aes() so it is not treated as data.
ggplot(shared, aes(x = minorfreq)) +
  geom_density(aes(group = factor(n), fill = factor(n)), alpha = 0.2)

# Distribution of `n` across nonsynonymous shared variants, used to choose the
# low/high cutoff
select(nonsyns_shared, !minorfreq) %>%
  unique() %>%
  ggplot(., aes(x = n)) +
  geom_histogram(binwidth = 1)

# Split at n = 5. The previous `n < 5` / `n > 5` pair left n == 5 in neither
# group, so those variants were also missing from all_shared.
# NOTE(review): n == 5 is placed in the high group to match the `n > 4`
# labelling used in SharedPlot - confirm this is the intended side.
low_shared <- filter(shared, n < 5) %>% unique() %>% mutate(cat = "low")
high_shared <- filter(shared, n >= 5) %>% unique() %>% mutate(cat = "high")
all_shared <- rbind(low_shared, high_shared)

ggplot(low_shared, aes(x = minorfreq)) +
  geom_histogram(binwidth = 0.01)

ggplot(high_shared, aes(x = minorfreq)) +
  geom_histogram(binwidth = 0.01)

ggplot(all_shared, aes(x = minorfreq)) +
  geom_density(aes(group = cat, fill = cat), alpha = 0.4)

# Welch t-test of minor allele frequency, low versus high group
t.test(low_shared$minorfreq, high_shared$minorfreq)
```

# Are there differences in AF between shared and non shared variants?
```{r}
# Variant labels that occur in exactly one distinct (ntvar, minorfreq, sample) row.
# NOTE(review): despite the name `oneferret`, this counts samples rather than
# ferrets, so a variant seen in one ferret on several days is excluded here -
# confirm that is intended.
oneferret = select(ferunique,ntvar, minorfreq, sample) %>% unique() %>% count(ntvar) %>% filter(n == 1) 
oneferret = unique(oneferret$ntvar)
singles = filter(ferunique, ntvar %in% oneferret) %>% unique()

# Table of non-shared variants with n fixed at 1
non_share = select(singles, ntvar, aavar, minorfreq) %>% mutate(n = 1)
non_share$cat = "not shared"

ggplot(non_share, aes(x = minorfreq)) +
  geom_histogram(binwidth = 0.01)
# Overwrites the low/high labels in all_shared with a single "shared" label
all_shared$cat = "shared"

all_shared_sub = select(all_shared, ntvar, aavar, minorfreq, n, cat)

# Shared and non-shared variants stacked for plotting
try_all = rbind(all_shared_sub, non_share) %>% unique()

ggplot(try_all, aes(x = minorfreq)) +
  geom_density(aes(group = cat, fill = cat), alpha = 0.4)

# Welch t-tests: non-shared versus the low and the high shared groups
t.test(non_share$minorfreq, low_shared$minorfreq)
t.test(non_share$minorfreq, high_shared$minorfreq)
```

# Combining all shared(btw obese and lean) compared to not shared
```{r}
# Box plot of minor allele frequency, shared versus not shared, zoomed to
# 0-0.1. coord_cartesian() only zooms the view; ylim() would discard every
# value above 0.1 before the box statistics are computed, so the quartiles
# would not describe the full data.
share_v_noshare_AF <- ggplot(try_all, aes(y = minorfreq, x = cat, color = cat)) +
  geom_boxplot(outlier.shape = NA) +
  #geom_jitter(alpha = 0.3) +
  coord_cartesian(ylim = c(0, 0.1)) +
  PlotTheme1
print(share_v_noshare_AF)
ggsave(
  filename = "share_v_noshare_AF.pdf",
  plot = share_v_noshare_AF,
  path = savedir,
  height = 5,
  width = 9
)

# Full-range view of the same comparison
ggplot(try_all, aes(y = minorfreq, x = cat, color = cat)) +
  geom_violin() +
  PlotTheme1

# Welch t-test of minor allele frequency, non-shared versus all shared
t.test(non_share$minorfreq, all_shared$minorfreq)
```

# Is there a difference in freq or abundance between obese v lean ferrets of the 67 non-diet-specific shared vars?
```{r}
# Frequency: minor allele frequency of the diet-shared variants by diet
ggplot(shared, aes(x = diet, y = minorfreq)) +
  geom_boxplot(aes(color = diet)) +
  DietcolScale +
  PlotTheme2
shared_ln = filter(shared, diet == "Lean")
shared_ob = filter(shared, diet == "Obese")
# Welch t-test, lean versus obese
t.test(shared_ln$minorfreq,shared_ob$minorfreq)

# Abundance: number of rows per variant and diet
# (this overwrites the shared_vars table built earlier in the notebook)
shared_vars = group_by(shared, ntvar, diet) %>% tally() 

ggplot(shared_vars, aes(x = ntvar, y = n, fill = diet)) +
geom_col(position = "dodge") + 
#facet_grid(~inf_route) +
PlotTheme1 +
DietcolScale_fill

# Variants whose obese and lean row counts differ by more than 2,
# reshaped back to long form (Lean/Obese only) for plotting
diff_shared_vars = group_by(shared, ntvar, diet) %>% 
  tally() %>% 
  pivot_wider(names_from = diet, values_from = n) %>% 
  mutate(diff = abs(Obese - Lean)) %>% 
  filter(diff > 2) %>%
  pivot_longer(cols = c("Lean", "Obese"), names_to = c("diet"))
  
ggplot(diff_shared_vars, aes(x = ntvar, y = value, fill = diet)) +
geom_col(position = "dodge") +
#facet_grid(~inf_route) +
PlotTheme1 +
DietcolScale_fill
```

# Is there a difference in AF of the variants found in obese and lean ferrets?
```{r}
# Minor allele frequency histograms of the diet-shared variants,
# rows = infection route, columns = diet
ggplot(shared, aes(x = minorfreq, fill = diet)) +
  geom_histogram(binwidth = 0.01) +
  PlotTheme1 +
  facet_grid(inf_route~diet) +
  DietcolScale_fill

# Welch t-test restricted to index ferrets, obese versus lean
o = filter(shared, inf_route == "Index" & diet == "Obese")
l = filter(shared, inf_route == "Index" & diet == "Lean")
t.test(o$minorfreq, l$minorfreq)
#not significantly different
```

```{r}
# Minor allele frequencies of variants not found in the stock, obese versus
# lean, tested separately for index and contact ferrets
obese_index = filter(ferunique, diet == "Obese" & inf_route == "Index") %>% ungroup()
lean_index = filter(ferunique, diet == "Lean" & inf_route == "Index") %>% ungroup()
t.test(obese_index$minorfreq, lean_index$minorfreq)
# means are not different

obese_contact = filter(ferunique, diet == "Obese" & inf_route == "Contact") %>% ungroup()
lean_contact = filter(ferunique, diet == "Lean" & inf_route == "Contact") %>% ungroup()
t.test(obese_contact$minorfreq, lean_contact$minorfreq)
# means are not different

# QQ plots: qqnorm() compares one sample against a normal distribution;
# qqplot() compares the quantiles of two samples, where points on x = y suggest
# they are drawn from the same distribution
qqnorm(obese_index$minorfreq, main = "Obese Index - Test of Normal Distribution")
qqnorm(lean_index$minorfreq, main = "Lean Index - Test of Normal Distribution")
# neither distribution is normal
qqplot(obese_index$minorfreq,lean_index$minorfreq, xlab = "Obese Index", ylab = "Lean Index")

qqnorm(obese_contact$minorfreq, main = "Obese Contact - Test of Normal Distribution")
qqnorm(lean_contact$minorfreq, main = "Lean Contact - Test of Normal Distribution")
# neither distribution is normal
qqplot(obese_contact$minorfreq,lean_contact$minorfreq, xlab = "Obese Contact", ylab = "Lean Contact")

# Mann-Whitney-Wilcoxon test (Mann-Whitney U test): samples are not normally distributed and independent of each other
wilcox.test(obese_index$minorfreq,lean_index$minorfreq)
wilcox.test(obese_contact$minorfreq,lean_contact$minorfreq)
# Rendered output in this file: index p = 0.079 (not different);
# contact p = 0.047, which is below pvalcut = 0.05 (different)

# Kolmogorov-Smirnov test: samples are not normally distributed and independent of each other
# "sensitive to differences in location and shape of the empirical CDFs of the two samples"
ks.test(obese_index$minorfreq,lean_index$minorfreq)
ks.test(obese_contact$minorfreq,lean_contact$minorfreq)
# Rendered output in this file: index p = 0.199 (not different);
# contact p = 0.013, which is below pvalcut = 0.05 (different)
```

